library(tidyverse)
library(plotly)
library(ggplot2)
library(knitr)
library(readr)
library(dplyr)
library(tidyr)
library(scales)
library(vcd)
library(tidyverse)
library(htmltools)
In this chapter, we analyze the relationship between locations and grades. Our analysis focuses exclusively on data where the grades are A, B, or C, and excludes records with grades labeled as N (Not Yet Graded), Z (Grade Pending), or P (Grade Pending issued upon reopening after a closure). To ensure accuracy and relevance, we begin by filtering out unnecessary data from the dataset before proceeding with the analysis.
# The same data cleaning process in demographics part, also only consider data with grade A, B, C.
manhattan_data = read_csv("Manhattan_Restaurant_Inspection_Results.csv", na = c("NA", "", "."))
str (manhattan_data)
## spc_tbl_ [94,616 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ CAMIS : num [1:94616] 50140436 50158081 50152703 50160975 50161087 ...
## $ DBA : chr [1:94616] "JUST SALAD" "THE MANNER" "MIDNIGHT BLUE" "BLUE BLOSSOM" ...
## $ BORO : chr [1:94616] "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
## $ BUILDING : chr [1:94616] "2853" "58" "106" "108" ...
## $ STREET : chr [1:94616] "BROADWAY" "THOMPSON STREET" "EAST 19 STREET" "WEST 39 STREET" ...
## $ ZIPCODE : num [1:94616] 10025 10012 10003 10018 10025 ...
## $ PHONE : num [1:94616] 7.32e+09 9.17e+09 3.48e+09 6.47e+09 9.29e+09 ...
## $ CUISINE DESCRIPTION : chr [1:94616] NA NA NA NA ...
## $ INSPECTION DATE : chr [1:94616] "01/01/1900" "01/01/1900" "01/01/1900" "01/01/1900" ...
## $ ACTION : chr [1:94616] NA NA NA NA ...
## $ VIOLATION CODE : chr [1:94616] NA NA NA NA ...
## $ VIOLATION DESCRIPTION: chr [1:94616] NA NA NA NA ...
## $ CRITICAL FLAG : chr [1:94616] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ SCORE : num [1:94616] NA NA NA NA NA NA NA NA NA NA ...
## $ GRADE : chr [1:94616] NA NA NA NA ...
## $ GRADE DATE : chr [1:94616] NA NA NA NA ...
## $ RECORD DATE : chr [1:94616] "11/05/2024" "11/05/2024" "11/05/2024" "11/05/2024" ...
## $ INSPECTION TYPE : chr [1:94616] NA NA NA NA ...
## $ Latitude : num [1:94616] 40.8 40.7 40.7 40.8 40.8 ...
## $ Longitude : num [1:94616] -74 -74 -74 -74 -74 ...
## $ Community Board : num [1:94616] 109 102 105 105 107 108 101 105 104 102 ...
## $ Council District : chr [1:94616] "07" "01" "02" "04" ...
## $ Census Tract : chr [1:94616] "019900" "004700" "005000" "011300" ...
## $ BIN : num [1:94616] 1075440 1087362 1017905 1015273 1055676 ...
## $ BBL : num [1:94616] 1.02e+09 1.00e+09 1.01e+09 1.01e+09 1.02e+09 ...
## $ NTA : chr [1:94616] "MN09" "MN24" "MN21" "MN17" ...
## $ Location Point1 : logi [1:94616] NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. CAMIS = col_double(),
## .. DBA = col_character(),
## .. BORO = col_character(),
## .. BUILDING = col_character(),
## .. STREET = col_character(),
## .. ZIPCODE = col_double(),
## .. PHONE = col_double(),
## .. `CUISINE DESCRIPTION` = col_character(),
## .. `INSPECTION DATE` = col_character(),
## .. ACTION = col_character(),
## .. `VIOLATION CODE` = col_character(),
## .. `VIOLATION DESCRIPTION` = col_character(),
## .. `CRITICAL FLAG` = col_character(),
## .. SCORE = col_double(),
## .. GRADE = col_character(),
## .. `GRADE DATE` = col_character(),
## .. `RECORD DATE` = col_character(),
## .. `INSPECTION TYPE` = col_character(),
## .. Latitude = col_double(),
## .. Longitude = col_double(),
## .. `Community Board` = col_double(),
## .. `Council District` = col_character(),
## .. `Census Tract` = col_character(),
## .. BIN = col_double(),
## .. BBL = col_double(),
## .. NTA = col_character(),
## .. `Location Point1` = col_logical()
## .. )
## - attr(*, "problems")=<externalptr>
zip_grade = manhattan_data %>%
janitor::clean_names() %>%
filter(
!is.na(dba),
!is.na(cuisine_description),
!is.na(grade),
!is.na(score),
!is.na(zipcode),
grade %in% c("A", "B", "C")
) %>% mutate(region = case_when(
zipcode >= 10000 & zipcode <= 10025 ~ "Downtown",
zipcode >= 10026 & zipcode <= 10040 ~ "Midtown",
zipcode >= 10041 & zipcode <= 10282 ~ "Uptown",
TRUE ~ "Other" # For ZIP codes like 11371, 12345, etc.
))
To investigate if where restaurants are located influence their inspection grades, we produce Chi-Square test. The result is presented in the Contingency Table Heatmap below. The p-value is less than 0.05, leading us to reject the null hypothesis that the region and grade are independent. This indicates a there a significant difference in the distribution of grades among regions at the 0.05 significance level.
# Create a contingency table of zip_code vs. grade
contingency_table <- table(zip_grade$region, zip_grade$grade)
# Perform Chi-Square Test
chi_test <- chisq.test(contingency_table)
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Convert the contingency table to a matrix
contingency_matrix <- as.matrix(contingency_table)
contingency_matrix <- contingency_matrix[order(contingency_matrix[, "A"], decreasing = TRUE), ]
# Extract the residuals (contributions to the Chi-Square statistic)
residuals_matrix <- chi_test$residuals
# Extract chi-square test statistic and p-value
chi_stat <- round(chi_test$statistic, 2)
chi_pval <- format.pval(chi_test$p.value, digits = 3)
# Add the chi-square test result as annotations
plot_ly(
x = colnames(contingency_matrix),
y = rownames(contingency_matrix),
z = contingency_matrix,
type = "heatmap",
colors = colorRamp(c("white", "purple"))
) %>%
layout(
title = "Contingency Table Heatmap with Chi-Square Result",
xaxis = list(title = "Grade"),
yaxis = list(title = "Region"),
colorbar = list(title = "Count"),
annotations = list(
list(
x = 2.1,
y = 3,
text = paste("Chi-Square Statistic:", chi_stat, "<br>P-Value:", chi_pval),
showarrow = FALSE,
font = list(size = 14)
)
)
)
## Warning: 'layout' objects don't have these attributes: 'colorbar'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'
To investigate how strong the association is between located and
grades, we produce Cramér’s V test. The Cramér’s V Value is 0.123, which
is larger than 0.1 (Weak association) and smaller than 0.3 (Moderate
association), so that we conclude they have weak association.
This
conclusion is visualized in theree pie charts below of different regions
that we can’t tell large difference among them.
# Calculate Cramér’s V
cramer_v <- assocstats(contingency_table)$cramer
# Print Cramér’s V
print(paste("Cramér's V:", round(cramer_v, 3)))
## [1] "Cramér's V: 0.027"
# uptown visualization
uptown <- zip_grade %>%
filter(region == "Uptown") %>%
group_by(region, grade) %>%
summarize(count = n(), .groups = "drop")
uptown_pie = plot_ly(
data = uptown,
labels = ~grade, # Grades (A, B, C)
values = ~count, # Counts of each grade
type = 'pie',
textinfo = 'label+percent', # Display labels and percentages
hoverinfo = 'label+value+percent' , # Show additional info on hover
colors = "viridis"
) %>%
layout(
title = "Uptown" ,
legend = list(title = list(text = "Grade"))
)
# Midtown visualization
midtown <- zip_grade %>%
filter(region == "Midtown") %>%
group_by(region, grade) %>%
summarize(count = n(), .groups = "drop")
midtown_pie = plot_ly(
data = midtown,
labels = ~grade, # Grades (A, B, C)
values = ~count, # Counts of each grade
type = 'pie',
textinfo = 'label+percent', # Display labels and percentages
hoverinfo = 'label+value+percent' , # Show additional info on hover
colors = "viridis"
) %>%
layout(
title = "Midtown" ,
legend = list(title = list(text = "Grade"))
)
# Downtown visualization
downtown <- zip_grade %>%
filter(region == "Downtown") %>%
group_by(region, grade) %>%
summarize(count = n(), .groups = "drop")
downtown_pie = plot_ly(
data = downtown,
labels = ~grade, # Grades (A, B, C)
values = ~count, # Counts of each grade
type = 'pie',
textinfo = 'label+percent', # Display labels and percentages
hoverinfo = 'label+value+percent' , # Show additional info on hover
colors = "viridis"
) %>%
layout(
title = "Downtown" ,
legend = list(title = list(text = "Grade"))
)
# Combine the pie charts into one layout
doc <- htmltools::tagList(
div(uptown_pie, style = "float:left;width:33.3%;"),
div(midtown_pie, style = "float:left;width:33.3%;"),
div(downtown_pie, style = "float:right;width:33.3%;")
)
htmltools::browsable(doc)